OLS Analysis Using the Full PSU Dataset


In [1]:
#Import required packages
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf
import numpy as np
import datetime
import seaborn as sns
import matplotlib.pyplot as plt
from statsmodels.sandbox.regression.predstd import wls_prediction_std

from statsmodels.tsa.stattools import adfuller
from statsmodels.tsa.seasonal import seasonal_decompose

In [2]:
def format_date(df_date):
    """
    Parse the raw schedule columns into datetime-derived features.

    Splits `Meeting_Times` / `Meeting_Dates` with regexes and derives:
    Days, Start_Date, Year, Quarter, Term_Date, End_Date, Start_Time,
    End_Time and Duration_Hr.  Mutates `df_date` in place and returns it.
    """
    # Raw strings for the regexes: '\s' inside a plain literal is an invalid
    # escape sequence (DeprecationWarning now, a SyntaxError in future Pythons).
    df_date['Days'] = df_date['Meeting_Times'].str.extract(r'([^\s]+)', expand=True)
    df_date['Start_Date'] = df_date['Meeting_Dates'].str.extract(r'([^\s]+)', expand=True)
    # Term is a YYYYQQ string/int, e.g. 201302 -> Year '2013', Quarter '02'.
    df_date['Year'] = df_date['Term'].astype(str).str.slice(0, 4)
    df_date['Quarter'] = df_date['Term'].astype(str).str.slice(4, 6)
    df_date['Term_Date'] = pd.to_datetime(df_date['Year'] + df_date['Quarter'], format='%Y%m')
    df_date['End_Date'] = df_date['Meeting_Dates'].str.extract(r'(?<=-)(.*)(?= )', expand=True)
    # Times are 24h 'HHMM' strings, e.g. 'MTWR 1000-1150'.
    df_date['Start_Time'] = df_date['Meeting_Times'].str.extract(r'(?<= )(.*)(?=-)', expand=True)
    df_date['Start_Time'] = pd.to_datetime(df_date['Start_Time'], format='%H%M')
    df_date['End_Time'] = df_date['Meeting_Times'].str.extract(r'((?<=-).*$)', expand=True)
    df_date['End_Time'] = pd.to_datetime(df_date['End_Time'], format='%H%M')
    # .dt.seconds (not total_seconds) is safe here: both timestamps share the
    # same dummy date, so the delta is always in [0, 24h).
    df_date['Duration_Hr'] = ((df_date['End_Time'] - df_date['Start_Time']).dt.seconds) / 3600
    return df_date

def format_xlist(df_xl):
    """
    Recompute spare-seat counts, honoring cross-listed sections.

    Cross-listed rows (non-empty `Xlst`) measure spare capacity against the
    cross-list max enrollment; all other rows use the physical room capacity.
    Rows whose room capacity is a sentinel value (>= 999) are dropped.
    Adds a `Cap_Diff` column to `df_xl` and returns the filtered frame.
    """
    actual = df_xl['Actual_Enrl'].astype(int)
    is_crosslisted = df_xl['Xlst'] != ''
    df_xl['Cap_Diff'] = np.where(is_crosslisted,
                                 df_xl['Max_Enrl'].astype(int) - actual,
                                 df_xl['Room_Capacity'].astype(int) - actual)

    return df_xl.loc[df_xl['Room_Capacity'].astype(int) < 999]

In [3]:
"""
Main program control flow.
"""
#pd.set_option('display.max_rows', None)
#pd.set_option('display.max_columns', None)
df = pd.read_csv('data/PSU_master_classroom.csv', dtype={'Schedule': object, 'Schedule Desc': object})
df = df.fillna('')

df = format_date(df)
# Avoid classes that only occur on a single day
df = df.loc[df['Start_Date'] != df['End_Date']]
df = df.loc[df['Online Instruct Method'] != 'Fully Online']

# Calculate number of days per week and treat Sunday condition
df['Days_Per_Week'] = df['Days'].str.len()
df['Room_Capacity'] = df['Room_Capacity'].apply(lambda x: x if (x != 'No Data Available') else 0)

df_cl = format_xlist(df)

In [4]:
# Map and Enumerate

from sklearn.preprocessing import LabelEncoder

cat_columns = ['Dept', 'Class', 'Meeting_Times', 'ROOM']

# format_xlist returned a .loc slice, so writing columns into df_cl raised
# SettingWithCopyWarning (see the captured warning below); take an explicit
# copy before mutating so the assignments are unambiguous.
df_cl = df_cl.copy()

for column in cat_columns:
    # Deterministic integer codes: labels sorted by np.unique, then enumerated.
    col_mapping = {label: idx for idx, label in enumerate(np.unique(df_cl[column]))}
    df_cl[column] = df_cl[column].map(col_mapping)


/home/kevin/anaconda3/envs/data-science/lib/python3.5/site-packages/ipykernel/__main__.py:9: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy

In [5]:
# Build the raw model matrix and label-encode the Dept column.
# NOTE(review): 'Dept' was already mapped to integers by the enumeration loop
# above, so this fit_transform is presumably a renumbering no-op — confirm
# whether the LabelEncoder step is still needed.
X = df_cl[['Dept', 'Term', 'Class', 'Meeting_Times', 'ROOM']].values
df_cl_le = LabelEncoder()
X[:, 0] = df_cl_le.fit_transform(X[:, 0])
X


Out[5]:
array([[    18, 201302,   4253,   1670,     15],
       [    18, 201302,   4263,   1178,     15],
       [    18, 201302,   4263,   1178,     16],
       ..., 
       [    29, 201301,   4364,   1716,    478],
       [    53, 201301,   3719,   1751,    478],
       [     7, 201301,    640,   1558,    478]])

In [6]:
from sklearn.preprocessing import OneHotEncoder

# `categorical_features` was removed in scikit-learn 0.22, so the original
# OneHotEncoder(categorical_features=[0]) call crashes on modern versions.
# Encoding only the first column (Dept) explicitly is equivalent for this
# cell: the dense array was discarded anyway — only the get_dummies frame
# below is displayed.
ohe = OneHotEncoder()
ohe.fit_transform(X[:, [0]]).toarray()

# NOTE(review): this dummies the raw `df` (string categories -> readable
# column names like Dept_ACTG), not the label-encoded `df_cl` used above —
# presumably intentional for display; confirm.
pd.get_dummies(df[['Dept', 'Term', 'Class', 'Meeting_Times', 'ROOM']])


Out[6]:
Term Dept_0 Dept_ACTG Dept_ANTH Dept_ARCH Dept_ART Dept_BA Dept_BI Dept_BST Dept_CCJ ... ROOM_UTS 308 ROOM_UTS 310 ROOM_XSB 101 ROOM_XSB 183 ROOM_XSB 201 ROOM_XSB 205 ROOM_XSB 259 ROOM_XSB 261 ROOM_XSB 291 ROOM_XSB 293
0 201302 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
1 201302 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
2 201302 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
3 201504 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
4 201401 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
5 201402 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
8 201302 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
9 201401 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
10 201301 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
11 201301 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
12 201301 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
13 201301 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
14 201301 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
15 201301 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
16 201302 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
17 201302 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
18 201302 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
19 201302 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
20 201303 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
21 201604 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
22 201301 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
23 201302 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
39 201302 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
40 201301 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
41 201304 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
42 201302 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
43 201601 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
44 201502 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
45 201502 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
46 201502 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
52418 201301 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 1 0 0 0
52419 201301 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 1 0 0 0
52420 201301 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 1 0 0
52421 201301 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 1 0 0
52422 201301 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 1 0 0
52423 201301 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 1 0 0
52424 201301 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 1 0 0
52425 201301 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 1 0 0
52426 201301 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 1 0 0
52427 201301 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 1 0 0
52429 201301 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 1 0
52430 201301 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 1 0
52431 201301 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 1 0
52432 201301 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 1 0
52433 201301 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 1 0
52434 201301 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 1 0
52435 201301 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 1 0
52436 201301 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 1 0
52437 201301 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 1
52438 201301 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 1
52439 201301 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 1
52440 201301 0 0 0 0 0 0 0 0 1 ... 0 0 0 0 0 0 0 0 0 1
52441 201301 0 0 0 0 0 0 0 0 1 ... 0 0 0 0 0 0 0 0 0 1
52442 201301 0 0 0 0 0 0 0 0 1 ... 0 0 0 0 0 0 0 0 0 1
52443 201301 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 1
52444 201301 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 1
52445 201301 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 1
52446 201301 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 1
52447 201301 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 1
52448 201301 0 0 0 0 0 0 0 1 0 ... 0 0 0 0 0 0 0 0 0 1

42186 rows × 8414 columns